# This script replicates the dictionary tables seen in our 
# appendix as well as the summary table seen in the main paper

# Load prerequisites
# Remove every (non-hidden) object from the current environment so the script
# starts from an empty workspace.
# NOTE(review): this does not detach packages or reset options, and it will
# wipe the workspace of anyone who sources this file interactively — running
# the script in a fresh R session is the safer way to get a clean state.
rm(list = ls())

library(tidyr)
library(dplyr)
library(purrr)
library(data.table)
library(ggplot2)
library(readxl)
library(rmarkdown)
library(kableExtra)

# Locate the replication folder. All relative paths below ("02_data/...",
# "03_output/...") are resolved against this directory.
# Set the REPLICATION_DIR environment variable to run the script from a
# different location; when it is unset or empty, the original Dropbox path
# is used.
replication_root <- Sys.getenv("REPLICATION_DIR", unset = "")
if (!nzchar(replication_root)) {
  replication_root <-
    "~/Dropbox/Diaspora_Narratives/Submission/SecStud/Replication"
}
setwd(replication_root)


# Read dictionaries
# Each workbook is loaded from 02_data/ and given an `Object` column naming
# the dictionary it belongs to, so the tables can be stacked afterwards.

# Diaspora terms: every row is labelled "Diaspora".
diaspora <- mutate(
  read_excel("02_data/dictionary_diaspora.xlsx"),
  Object = "Diaspora"
)

# Social-group terms: the workbook's `Type` column is reused as `Object`.
social_groups <- rename(
  read_excel("02_data/dictionary_socialgroups.xlsx"),
  Object = Type
)

# Racism terms: every row is labelled "Racism".
racism <- mutate(
  read_excel("02_data/dictionary_racism.xlsx"),
  Object = "Racism"
)

# Violence terms: every row is labelled "Violence".
violence <- mutate(
  read_excel("02_data/dictionary_violence.xlsx"),
  Object = "Violence"
)

# Combine dictionaries
# Stack the four dictionaries into one table for the appendix.
sg_d <- rbind(diaspora, social_groups, racism, violence)

# Keep only the columns that are printed or used for ordering.
sg_d <- select(sg_d, Term, Object, Definition)

# Drop everything from the first "CL" onward in each definition.
# NOTE(review): presumably this removes a trailing classifier annotation from
# the dictionary source — confirm against the workbooks.
sg_d <- mutate(sg_d, Definition = gsub("CL.*", "", Definition))

# Sort by Object so that each dictionary occupies a contiguous run of rows.
sg_d <- arrange(sg_d, Object)

# Produce Social Group Dictionary Table

# Build the "Dictionaries for Identity Framing Analysis" table from `sg_d`.
#
# output_format: kable format string passed to kbl(), "latex" or "html".
# Returns the styled kable object; the caller decides whether to save it.
#
# The same options are passed for both formats so the two renderings stay in
# sync. The group row ranges are hard-coded and must match the row order of
# `sg_d` (sorted by Object); update them if the dictionaries change.
build_sg_dict_table <- function(output_format) {
  removed_terms_note <- "Notes: The following terms were removed as they were missing in 10% or more subcorpora: 侨务, 侨民, 侨胞, 华埠, 非洲裔美国人, 受害, 处决, and 敌对."

  # Every group header row uses the same styling: italic, not bold, no indent.
  pack_group <- function(tbl, label, first_row, last_row) {
    pack_rows(tbl, label, first_row, last_row,
              bold = FALSE, italic = TRUE, indent = FALSE,
              label_row_css = "", latex_gap_space = "")
  }

  sg_d |>
    select(-Object) |>
    kbl(format = output_format,
        caption = "Dictionaries for Identity Framing Analysis",
        label = "diaspora_groups",
        booktabs = TRUE,
        longtable = TRUE) |>
    kable_classic(full_width = FALSE) |>
    column_spec(2, width = "12cm") |>
    pack_group("African Americans (Object)", 1, 3) |>
    pack_group("Diaspora (Object)", 4, 8) |>
    pack_group("Racism (Attribute)", 9, 15) |>
    pack_group("Violence (Attribute)", 16, 29) |>
    pack_group("White Americans (Object)", 30, 31) |>
    kableExtra::footnote(general = removed_terms_note,
                         escape = TRUE,
                         threeparttable = TRUE,
                         general_title = "") |>
    kable_styling(latex_options = c("repeat_header"),
                  repeat_header_continued = "\\textit{(Continued on Next Page...)}")
}

## Latex version
build_sg_dict_table("latex") |>
  save_kable(file = "03_output/Tables/diaspora_dict.tex")

## Word version
# Rendered as HTML and left as the value of this top-level expression; it is
# not written to disk by this script.
build_sg_dict_table("html")


# Read country dictionaries

# Country names. `Chinese Name` may hold several "|"-separated variants: each
# variant becomes its own row (via unnest), while only the first
# "|"-separated segment of `English Name` is kept as the definition.
# map_chr() is used so the result is guaranteed to be a character vector with
# one entry per row; an empty split fails loudly at this step.
countries <- read_excel("02_data/dictionary_countrynames.xlsx") |>
  select(Term = `Chinese Name`, Definition = `English Name`) |>
  mutate(
    Term = strsplit(Term, "\\|"),
    Object = "Country",
    Definition = map_chr(strsplit(Definition, "\\|"), 1)
  ) |>
  unnest(cols = Term)

# Country attribute terms, restricted to the three attribute types used in
# the tables. Definitions are truncated at the first "CL" and all whitespace
# is stripped from the terms.
neg_lig <- read_excel("02_data/dictionary_countryattributes.xlsx") |>
  filter(Type %in% c("Chaos", "Corruption", "Sports")) |>
  select(Object = Type, Term, Definition) |>
  mutate(
    Definition = gsub(pattern = "CL.*", replacement = "", Definition),
    Term = gsub("\\s", "", Term)
  )

# Stack both dictionaries and sort by Object so each group is contiguous.
cn_d <- rbind(countries, neg_lig) |>
  select(Term, Object, Definition) |>
  arrange(Object)

# Produce Political Wedge Dictionary Tables

# Build the "Dictionaries for Political Wedge Analysis" table from `cn_d`.
#
# output_format: kable format string passed to kbl(), "latex" or "html".
# Returns the styled kable object; the caller decides whether to save it.
#
# The group row ranges are hard-coded and must match the row order of `cn_d`
# (sorted by Object); update them if the dictionaries change.
build_cn_dict_table <- function(output_format) {
  # NOTE(review): this footnote text is identical to the one used on the
  # identity-framing dictionary table — confirm that the listed terms are the
  # ones intended for the political-wedge dictionaries.
  removed_terms_note <- "Notes: The following terms were removed as they were missing in 10% or more subcorpora: 侨务, 侨民, 侨胞, 华埠, 非洲裔美国人, 受害, 处决, and 敌对."

  # Every group header row uses the same styling: italic, not bold, no indent.
  pack_group <- function(tbl, label, first_row, last_row) {
    pack_rows(tbl, label, first_row, last_row,
              bold = FALSE, italic = TRUE, indent = FALSE,
              label_row_css = "", latex_gap_space = "")
  }

  cn_d |>
    select(-Object) |>
    kbl(format = output_format,
        caption = "Dictionaries for Political Wedge Analysis",
        label = "neglig_groups",
        booktabs = TRUE,
        longtable = TRUE) |>
    kable_classic(full_width = FALSE) |>
    column_spec(2, width = "12cm") |>
    pack_group("Chaos (Attribute)", 1, 39) |>
    pack_group("Corruption (Attribute)", 40, 64) |>
    pack_group("Country (Object)", 65, 323) |>
    pack_group("Sports (Attribute)", 324, 386) |>
    kableExtra::footnote(general = removed_terms_note,
                         escape = TRUE,
                         threeparttable = TRUE,
                         general_title = "") |>
    kable_styling(latex_options = c("repeat_header"),
                  repeat_header_continued = "\\textit{(Continued on Next Page...)}")
}

## Latex version
build_cn_dict_table("latex") |>
  save_kable(file = "03_output/Tables/country_dict.tex")

## Word version
# Rendered as HTML and left as the value of this top-level expression; it is
# not written to disk by this script.
build_cn_dict_table("html")


# Summary Table

df_sum <- fread(file = "02_data/sum_data.csv")

# Build the "WeChat Subscription Account Metadata" table from `df_sum`.
#
# output_format: kable format string passed to kbl(), "latex" or "html".
# Returns the styled kable object; the caller decides whether to save it.
#
# The data are reshaped to one row per account (English and Chinese name) with
# one column of `Total Articles` per Year. No grouped computation is needed,
# so the fread() result is simply converted to a tibble before reshaping.
build_metadata_table <- function(output_format) {
  df_sum |>
    as_tibble() |>
    pivot_wider(id_cols = c(`English Name`, `Chinese Name`),
                values_from = `Total Articles`,
                names_from = Year) |>
    rename(English = `English Name`, Chinese = `Chinese Name`) |>
    kbl(row.names = FALSE,
        booktabs = TRUE,
        label = "metadata",
        caption = "WeChat Subscription Account Metadata",
        format = output_format) |>
    kable_styling(latex_options = c("hold_position")) |>
    # Row ranges are hard-coded: they assume the first 8 rows are government
    # accounts and the next 6 are independent ones — TODO confirm against
    # the row order of sum_data.csv.
    pack_rows("Government", 1, 8, bold = FALSE, italic = TRUE,
              label_row_css = "", latex_gap_space = "") |>
    pack_rows("Independent", 9, 14, bold = FALSE, italic = TRUE,
              label_row_css = "", latex_gap_space = "") |>
    # The header spans 2 name columns plus 3 year columns, which assumes
    # exactly three distinct Year values in the data.
    add_header_above(c("Subscription Account Name" = 2, "Articles by Year" = 3))
}

## Latex version
build_metadata_table("latex") |>
  save_kable(file = "03_output/Tables/acct_metadata.tex", keep_tex = TRUE)

## Word version
# Rendered as HTML and left as the value of this top-level expression; it is
# not written to disk by this script.
build_metadata_table("html")
